import copy
import json
import os

import pymongo
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
# Open a MongoDB connection on port 27017 and select the 'tiktok' database;
# the import loop further down writes into one collection per hashtag.
client = MongoClient(port=27017)
db = client["tiktok"]
# Template for one stored video document. Only '_id' and the two
# 'text_feature' entries are filled in by the code visible in this file;
# the remaining fields are left as empty placeholders.
obj = {
    "_id": "",
    "text_feature": {
        "text": "",
        "stickerText": [],
    },
    "video_feature": {
        "text_embed": [],
        "img_embed": {},
        "audio": {
            "yamnet": [],
        },
        "editing": [],
        "label": {},
        "residual": {},
    },
    "img_feature": {},
}

# Hashtags to import; each one maps to a MongoDB collection of the same name.
ht = [
    "artmas",
    "workingathome",
    "carsoftiktok",
    "wildanimals",
    "lunarnewyear",
]

# Video id -> description text for every video already accepted, used to
# avoid handling the same id twice.
outdict = {}

# Video id -> [int, int] as read from the tab-separated label files.
labels = {}
# Directory holding the tab-separated label files.
LABEL_DIR = 'D:\\Work\\kusuri\\data_pi\\'


def _load_labels(path, into):
    """Read a tab-separated label file and merge its rows into *into*.

    Each non-blank line must contain at least three tab-separated fields:
    a key followed by two integers. The row is stored as
    ``into[key] = [int(field1), int(field2)]``; a later row with the same
    key overwrites an earlier one.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped. Any other malformed line still raises ``IndexError`` or
    ``ValueError`` so that corrupt label data is not silently ignored.
    """
    with open(path, 'r', encoding='utf-8') as label_file:
        for line in label_file:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Whitespace-only line: nothing to parse.
                continue
            into[fields[0]] = [int(fields[1]), int(fields[2])]


# Only the first two hashtags have their own label file; the shared
# 'labels_r1.txt' file is loaded afterwards and therefore wins on conflicts.
for hashtag in ht[:2]:
    _load_labels(LABEL_DIR + 'labels_' + hashtag + '.txt', labels)
_load_labels(LABEL_DIR + 'labels_r1.txt', labels)
# Directory containing the JSON-lines metadata dumps (one JSON record per line).
TIKTOK_DUMP_DIR = 'D:\\Work\\Tool\\tiktok\\TikToks\\'
# Root directory of the downloaded videos: <root>\<hashtag>\<video id>.mp4
VIDEO_ROOT_DIR = 'F:\\Tiktok\\Hashtag\\'
# Upper bound on accepted videos per hashtag (across all of its dump files).
MAX_VIDEOS_PER_HASHTAG = 1000
# Upper bound on records read from a single dump file.
MAX_RECORDS_PER_FILE = 2000


def _sticker_texts(record):
    """Collect the sticker text strings attached to one metadata record.

    'stickerTextList' takes precedence over 'stickersOnItem'. For
    'stickerTextList' only list-valued 'stickerText' entries are used; for
    'stickersOnItem' a list is flattened and any other value is appended
    as-is, while items without a 'stickerText' key are skipped.
    """
    collected = []
    if 'stickerTextList' in record:
        for sticker in record['stickerTextList']:
            if isinstance(sticker['stickerText'], list):
                collected.extend(sticker['stickerText'])
    elif 'stickersOnItem' in record:
        for sticker in record['stickersOnItem']:
            # The key must be looked up on the sticker item itself, not on
            # the enclosing record.
            if 'stickerText' not in sticker:
                continue
            value = sticker['stickerText']
            if isinstance(value, list):
                collected.extend(value)
            else:
                collected.append(value)
    return collected


def _parse_record(record):
    """Return ``(video_id, description, sticker_texts)`` for one record.

    Two layouts are recognised: a top-level 'id' with the description under
    'desc', or an 'itemInfos' object carrying 'id' and 'text'. A record with
    neither yields ``('', '', [])``.
    """
    if 'id' in record:
        return record['id'], record['desc'], _sticker_texts(record)
    if 'itemInfos' in record:
        info = record['itemInfos']
        # NOTE(review): sticker keys are read from the top-level record here,
        # not from 'itemInfos' -- confirm that matches this dump format.
        return info['id'], info['text'], _sticker_texts(record)
    return '', '', []


# For every hashtag, scan the dump files whose name contains the hashtag and
# insert one document per video that is labelled, present on disk and not
# yet handled.
for hashtag in ht:
    accepted = 0
    hashtag = hashtag.replace('#', '').lower()
    for fname in os.listdir(TIKTOK_DUMP_DIR):
        if accepted >= MAX_VIDEOS_PER_HASHTAG:
            # Quota reached: no need to open the remaining files.
            break
        if hashtag not in fname.lower():
            continue
        with open(TIKTOK_DUMP_DIR + fname, 'r', encoding='utf-8', newline='\n') as dump_file:
            records_read = 0
            for line in dump_file:
                if accepted >= MAX_VIDEOS_PER_HASHTAG or records_read >= MAX_RECORDS_PER_FILE:
                    break
                if not line.strip():
                    # Blank lines are not JSON records and do not count
                    # towards the per-file limit.
                    continue
                records_read += 1
                video_id, text, stickers = _parse_record(json.loads(line))

                # Cheap dictionary checks first, the disk lookup last.
                if video_id in outdict or video_id not in labels:
                    continue
                if not os.path.exists(VIDEO_ROOT_DIR + hashtag + '\\' + video_id + '.mp4'):
                    continue

                outdict[video_id] = text
                # Work on a private copy so the shared template is never
                # mutated by an individual record.
                document = copy.deepcopy(obj)
                document['_id'] = video_id
                document['text_feature']['text'] = text
                document['text_feature']['stickerText'] = stickers
                try:
                    db[hashtag].insert_one(document)
                except pymongo.errors.DuplicateKeyError:
                    print(video_id, 'duplicate')
                else:
                    print('insert successful', video_id)
                # Duplicates count towards the quota as well.
                accepted += 1